package com.kdtech.analyse.news;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.kdtech.analyse.AnalyseNews;
import com.kdtech.crawler.CrawlHTML;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.HtmlCleaner;
import com.kdtech.utils.StringUtils;

/**
 * News parser for sdchina.com (China Shandong Net).
 *
 * <p>Decides whether a URL points at an article detail page and, if HTML is
 * available, extracts title, body, publication date and author into a
 * {@link NewsMeta}. Each sub-site uses its own CSS selectors; generic fallbacks
 * are tried when the sub-site specific selectors yield nothing.
 *
 * xiaonie 2012-11-22
 */
public class SdChinaNewsAnalyse implements AnalyseNews {
	/**
	 * URL shapes recognised as article detail pages. Compiled once instead of on
	 * every call, with every literal dot escaped so that it no longer matches an
	 * arbitrary character.
	 */
	private static final Pattern[] DETAIL_PAGE_PATTERNS = {
			Pattern.compile("http://[a-z]*[.]sdchina[.]com/show/[0-9]+[.]html"),
			Pattern.compile("http://[a-z]*[.]sdchina[.]com/minsheng/[0-9]+[.]html"),
			Pattern.compile("http://[a-z]*[.]sdchina[.]com/news/[0-9]+[.]html"),
			Pattern.compile("http://[a-z]*[.]sdchina[.]com/news/[0-9]{6}/[0-9]+[.]html"),
			Pattern.compile("http://sports[.]sdchina[.]com/show/[0-9]*/[0-9]+[.]html") };

	/** Selectors tried in order for the date when the sub-site specific one yields none. */
	private static final String[] FALLBACK_DATE_SELECTORS = { "h4", "div.digArticle h3", "span#labDate" };

	/** Selectors tried in order for the body when the sub-site specific one is blank. */
	private static final String[] FALLBACK_CONTENT_SELECTORS = { "div.cleftfa", "div#nrmain" };

	/**
	 * Tells whether the URL is an article detail page of sdchina.com.
	 *
	 * @param url absolute URL to test; {@code null} is treated as "not a detail page"
	 * @return {@code true} if the whole URL matches one of the known detail-page
	 *         patterns and is not on the English sub-site
	 */
	public boolean isDetailPage(String url) {
		if (url == null) {
			return false;
		}
		// The English sub-site is excluded even though its URLs fit the patterns.
		if (url.startsWith("http://english.sdchina.com")) {
			return false;
		}
		for (Pattern pattern : DETAIL_PAGE_PATTERNS) {
			if (pattern.matcher(url).matches()) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Extracts the article fields from an already downloaded page.
	 *
	 * @param urlMeta crawl result carrying the page URL and its HTML
	 * @return a {@link NewsMeta} filled with URL, title, content, date, type and
	 *         author; an empty {@link NewsMeta} when {@code urlMeta} is
	 *         {@code null} or carries no HTML
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		NewsMeta news = new NewsMeta();
		if (urlMeta == null) {
			return news;
		}
		String htmltxt = urlMeta.getHtml();
		// Guard before parsing: without HTML there is nothing to extract. The
		// emptiness test uses isEmpty() because != on strings compares references.
		if (htmltxt == null || htmltxt.isEmpty()) {
			return news;
		}
		String url = urlMeta.getUrl();
		Document doc = Jsoup.parse(htmltxt);
		String title;
		String content;
		String dateStr;

		news.setUrl(url);
		// Drop links inside tables of the article text before any extraction.
		doc.select("div.entmain div.text table a").remove();
		if (url.startsWith("http://city.sdchina.com/show/") || url.startsWith("http://leisure.sdchina.com/show/")) {
			title = doc.select("h1").text();
			content = HtmlCleaner.getContentHtml(url, doc.select("div.digtext"));
			dateStr = doc.select("li.height25").html();
		} else if (url.startsWith("http://finance.sdchina.com/show/")) {
			title = doc.select("h1").text();
			content = HtmlCleaner.getContentHtml(url, doc.select("div.mar_top15.text.width_b95"));
			dateStr = doc.select("li.height25").html();
		} else if (url.startsWith("http://edu.sdchina.com/show/")) {
			title = doc.select("h1").text();
			content = HtmlCleaner.getContentHtml(url, doc.select("div.mar_top15.text"));
			dateStr = doc.select("li.height25").html();
		} else if (url.startsWith("http://news.sdchina.com/minsheng/")) {
			title = doc.select("h2").text();
			content = HtmlCleaner.getContentHtml(url, doc.select("div[style=width:94%;margin:0px auto;text-align: left; font-size: 14px;]"));
			dateStr = doc.select("span.fa4_1").text();
			// Only this section carries a comment counter; Update fetches it.
			NewsMeta update = Update(news);
			if (update != null) {
				news.setCommentNum(update.getCommentNum());
			}
			news.setUpdateUrl(url);
		} else if (url.startsWith("http://agri.sdchina.com/show/")) {
			title = doc.select("h1").text();
			content = HtmlCleaner.getContentHtml(url, doc.select("div.text"));
			dateStr = doc.select("li.height25").html();
		} else if (url.startsWith("http://auto.sdchina.com/news/")) {
			title = doc.select("div.display_title").text();
			content = HtmlCleaner.getContentHtml(url, doc.select("div.display_content"));
			dateStr = doc.select("div.Tags").text();
		} else if (url.startsWith("http://gangchang.sdchina.com/")) {
			title = doc.select("div.slefte div.sleftea h2").text();
			content = HtmlCleaner.getContentHtml(url, doc.select("div.slefte div.slefteb"));
			dateStr = doc.select("div.slefte div.sleftea h3").text();
		} else {
			title = doc.select("h1").text();
			content = HtmlCleaner.getContentHtml(url, doc.select("div.text.size14"));
			dateStr = doc.select("li.height25").html();
		}

		if (StringUtils.isBlank(title)) {
			title = doc.select("h3").text();
		}
		Long date = resolveDate(doc, dateStr);
		content = resolveContent(url, doc, content);

		news.setContent(content);
		news.setTitle(title);
		news.setClickNum(null);
		news.setDate(date);
		news.setType(1);

		String author = doc.select("div.entmain div.digArticle ul.fa4_a li.height25 span").text();
		if (author != null) {
			// Strip the "来源：" ("Source:") label so only the source name remains.
			author = author.replace("来源：", "");
		}
		news.setAuthor(author);
		return news;
	}

	/** Parses the date from {@code dateStr}, then from each fallback selector until one matches. */
	private Long resolveDate(Document doc, String dateStr) {
		Long date = DateUtils.matchDate(dateStr);
		for (int i = 0; date == null && i < FALLBACK_DATE_SELECTORS.length; i++) {
			date = DateUtils.matchDate(doc.select(FALLBACK_DATE_SELECTORS[i]).text());
		}
		return date;
	}

	/** Returns {@code content} if not blank, otherwise the result of the fallback selectors tried in order. */
	private String resolveContent(String url, Document doc, String content) {
		String resolved = content;
		for (int i = 0; StringUtils.isBlank(resolved) && i < FALLBACK_CONTENT_SELECTORS.length; i++) {
			resolved = HtmlCleaner.getContentHtml(url, doc.select(FALLBACK_CONTENT_SELECTORS[i]));
		}
		return resolved;
	}

	/**
	 * Manual smoke test: downloads one article and prints the parse result.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		SdChinaNewsAnalyse a = new SdChinaNewsAnalyse();
		String url = "http://meirong.sdchina.com/show/2941137.html";
		UrlMeta meta = CrawlHTML.responseToURL(url);
		System.out.println(a.isDetailPage(url));
		NewsMeta parserHtml = a.parserHtml(meta);
		System.out.println(parserHtml);
	}

	/**
	 * Downloads the page at {@code meta.getUrl()} and stores its comment count on
	 * {@code meta}.
	 *
	 * @param meta news item to refresh; modified in place
	 * @return the same {@code meta} with its comment count set, or {@code null}
	 *         when {@code meta} is {@code null} or the page could not be fetched
	 */
	public NewsMeta Update(NewsMeta meta) {
		if (meta == null) {
			return null;
		}
		UrlMeta fetched = CrawlHTML.responseToURL(meta.getUrl());
		if (fetched == null || fetched.getHtml() == null) {
			return null;
		}
		Document doc = Jsoup.parse(fetched.getHtml());
		Integer commentNum = 0;
		try {
			commentNum = Integer.parseInt(doc.select("span#ReplyrControl1_lblReplyTotalCount").text());
		} catch (NumberFormatException ignored) {
			// Counter element missing or not numeric: keep the default of 0.
		}
		meta.setCommentNum(commentNum);
		return meta;
	}

	/**
	 * Reports whether items from this site need periodic refreshing.
	 *
	 * @return always {@code false}
	 */
	public boolean isNeedUpdate() {
		return false;
	}
}
